Exercise 2

library(data.table)
## Warning: package 'data.table' was built under R version 4.2.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(mlr3)
## Warning: package 'mlr3' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.3
library(ranger)
## Warning: package 'ranger' was built under R version 4.2.3
library(class)
## Warning: package 'class' was built under R version 4.2.3
library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 4.2.3

Data Exploration and Data Preprocessing

Data Exploration

str(train_data)
## Classes 'data.table' and 'data.frame':   10486 obs. of  17 variables:
##  $ V1 : num  4271 4290 4269 4278 4305 ...
##  $ V2 : num  4268 4247 4268 4242 4278 ...
##  $ V3 : num  4024 3943 4002 3974 3991 ...
##  $ V4 : num  4634 4625 4620 4589 4613 ...
##  $ V5 : num  4124 4110 4119 4095 4131 ...
##  $ V6 : num  4349 4324 4342 4315 4342 ...
##  $ V7 : num  4220 4239 4218 4227 4234 ...
##  $ V8 : num  4100 4056 4060 4046 4079 ...
##  $ V9 : num  4209 4203 4208 4182 4211 ...
##  $ V10: num  4628 4605 4634 4602 4632 ...
##  $ V11: num  4192 4203 4196 4196 4207 ...
##  $ V12: num  4218 4230 4249 4223 4231 ...
##  $ V13: num  4290 4276 4267 4264 4280 ...
##  $ V14: num  4236 4165 4095 4085 4179 ...
##  $ V15: num  4567 4603 4586 4590 4619 ...
##  $ V16: num  4340 4340 4340 4343 4354 ...
##  $ V17: int  1 1 1 1 2 1 2 2 2 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
paged_table(head(train_data))
paged_table(head(test_data))
summary(train_data)
##        V1             V2             V3             V4               V5        
##  Min.   :1045   Min.   :1047   Min.   :2831   Min.   :  2762   Min.   :  2453  
##  1st Qu.:4280   1st Qu.:4251   1st Qu.:3991   1st Qu.:  4610   1st Qu.:  4108  
##  Median :4294   Median :4262   Median :4006   Median :  4618   Median :  4120  
##  Mean   :4301   Mean   :4264   Mean   :4010   Mean   :  4654   Mean   :  4183  
##  3rd Qu.:4311   3rd Qu.:4272   3rd Qu.:4024   3rd Qu.:  4627   3rd Qu.:  4132  
##  Max.   :7402   Max.   :5768   Max.   :7804   Max.   :362561   Max.   :642572  
##                                                                NA's   :15      
##        V6             V7             V8               V9            V10      
##  Min.   :2085   Min.   :1816   Min.   :  3584   Min.   :1353   Min.   :4555  
##  1st Qu.:4331   1st Qu.:4220   1st Qu.:  4058   1st Qu.:4190   1st Qu.:4604  
##  Median :4339   Median :4229   Median :  4070   Median :4200   Median :4614  
##  Mean   :4342   Mean   :4231   Mean   :  4127   Mean   :4201   Mean   :4616  
##  3rd Qu.:4348   3rd Qu.:4240   3rd Qu.:  4084   3rd Qu.:4210   3rd Qu.:4625  
##  Max.   :6472   Max.   :6683   Max.   :567191   Max.   :7148   Max.   :7259  
##                                NA's   :6                                     
##       V11            V12            V13            V14             V15         
##  Min.   :3282   Min.   :1816   Min.   :3095   Min.   :4057    Min.   :  79.93  
##  1st Qu.:4189   1st Qu.:4220   1st Qu.:4267   1st Qu.:4112    1st Qu.:4589.85  
##  Median :4201   Median :4229   Median :4277   Median :4153    Median :4603.20  
##  Mean   :4202   Mean   :4231   Mean   :4279   Mean   :4157    Mean   :4604.96  
##  3rd Qu.:4212   3rd Qu.:4240   3rd Qu.:4288   3rd Qu.:4205    3rd Qu.:4618.12  
##  Max.   :6142   Max.   :6683   Max.   :7007   Max.   :4255    Max.   :4831.53  
##                                               NA's   :10319                    
##       V16              V17       
##  Min.   :  1357   Min.   :1.000  
##  1st Qu.:  4341   1st Qu.:1.000  
##  Median :  4355   Median :1.000  
##  Mean   :  4440   Mean   :1.449  
##  3rd Qu.:  4373   3rd Qu.:2.000  
##  Max.   :715901   Max.   :2.000  
## 
summary(test_data)
##        V1               V2             V3             V4             V5      
##  Min.   :  4199   Min.   :4197   Min.   :3906   Min.   :4002   Min.   :4058  
##  1st Qu.:  4281   1st Qu.:4250   1st Qu.:3990   1st Qu.:4611   1st Qu.:4108  
##  Median :  4294   Median :4263   Median :4005   Median :4618   Median :4121  
##  Mean   :  4370   Mean   :4265   Mean   :4009   Mean   :4620   Mean   :4122  
##  3rd Qu.:  4312   3rd Qu.:4271   3rd Qu.:4023   3rd Qu.:4626   3rd Qu.:4131  
##  Max.   :309231   Max.   :6881   Max.   :5501   Max.   :4757   Max.   :5416  
##        V6             V7             V8             V9              V10      
##  Min.   :4309   Min.   :3915   Min.   :2086   Min.   :  4148   Min.   :4572  
##  1st Qu.:4332   1st Qu.:4221   1st Qu.:4058   1st Qu.:  4191   1st Qu.:4605  
##  Median :4339   Median :4230   Median :4070   Median :  4200   Median :4614  
##  Mean   :4342   Mean   :4231   Mean   :4073   Mean   :  4260   Mean   :4616  
##  3rd Qu.:4347   3rd Qu.:4240   3rd Qu.:4084   3rd Qu.:  4209   3rd Qu.:4624  
##  Max.   :5455   Max.   :4363   Max.   :4178   Max.   :265641   Max.   :4770  
##       V11            V12            V13            V14            V15        
##  Min.   :4106   Min.   :3915   Min.   :2258   Min.   :4000   Min.   :  4450  
##  1st Qu.:4190   1st Qu.:4221   1st Qu.:4268   1st Qu.:4050   1st Qu.:  4591  
##  Median :4201   Median :4230   Median :4277   Median :4099   Median :  4603  
##  Mean   :4203   Mean   :4231   Mean   :4279   Mean   :4100   Mean   :  4639  
##  3rd Qu.:4211   3rd Qu.:4240   3rd Qu.:4288   3rd Qu.:4149   3rd Qu.:  4618  
##  Max.   :6823   Max.   :4363   Max.   :4395   Max.   :4200   Max.   :152308  
##       V16            V17       
##  Min.   :4212   Min.   :1.000  
##  1st Qu.:4342   1st Qu.:1.000  
##  Median :4355   Median :1.000  
##  Mean   :4362   Mean   :1.448  
##  3rd Qu.:4373   3rd Qu.:2.000  
##  Max.   :5023   Max.   :2.000
colSums(is.na(train_data))
##    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10   V11   V12   V13 
##     0     0     0     0    15     0     0     6     0     0     0     0     0 
##   V14   V15   V16   V17 
## 10319     0     0     0
# Storage class of each column: the echoed output below shows V1-V16 are
# "numeric" and the target V17 is "integer".  sapply() is acceptable here
# because class() returns a single string for every column of this table.
column_types <- sapply(train_data, class)
column_types
##        V1        V2        V3        V4        V5        V6        V7        V8 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
##        V9       V10       V11       V12       V13       V14       V15       V16 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
##       V17 
## "integer"

We can see the structure of the datasets. I called summary() and also checked the type of each column so I can get a better understanding of the data. I can see that in the training set column V14 has 10319 NA's, meaning that this column is useless. Some columns have very large maximum values. Now I will plot the data and see if I get some interesting insights.

# Scatter plots of each feature against the row index, coloured by the
# target V17, split into three panels (V1-V6, V7-V12, V13-V16).
# seq_len(nrow(...)) replaces the 1:nrow(...) anti-pattern, which would
# produce c(1, 0) on an empty table instead of an empty sequence.
plots1 <- list()
for (i in colnames(train_data[, .(V1, V2, V3, V4, V5, V6)])) {
  plots1[[i]] <- plot_ly(data = train_data, x = seq_len(nrow(train_data)),
                         y = train_data[, get(i)], color = train_data[, V17],
                         type = "scatter", mode = "markers")
}
plots2 <- list()
for (i in colnames(train_data[, .(V7, V8, V9, V10, V11, V12)])) {
  plots2[[i]] <- plot_ly(data = train_data, x = seq_len(nrow(train_data)),
                         y = train_data[, get(i)], color = train_data[, V17],
                         type = "scatter", mode = "markers")
}
plots3 <- list()
for (i in colnames(train_data[, .(V13, V14, V15, V16)])) {
  plots3[[i]] <- plot_ly(data = train_data, x = seq_len(nrow(train_data)),
                         y = train_data[, get(i)], color = train_data[, V17],
                         type = "scatter", mode = "markers")
}

subplot(plots1, nrows =3 , titleY = T, margin = 0.05)
## Warning: Ignoring 15 observations
subplot(plots2, nrows =3 , titleY = T, margin = 0.05)
## Warning: Ignoring 6 observations
subplot(plots3, nrows =2 , titleY = T, margin = 0.05)
## Warning: Ignoring 10319 observations

On these plots we can see that we have a few outliers in each variable. These outliers can possibly be errors or anomalies. In V4, V5, V8 and V16 it is clear that the outliers must be errors because their values are more than 300 000. For the others it is possible that the outliers are just anomalies. I will remove the error outliers in data cleaning. We can also see that V14 has significantly fewer values, because there are more than 10000 NA's in the V14 column. I will need to do something with this column in data cleaning.

print(paste("number of observations in training data:",nrow(train_data)))
## [1] "number of observations in training data: 10486"
print(paste("number of NA's in train data:",sum(is.na(train_data))))
## [1] "number of NA's in train data: 10340"
print(paste("number of observations in test data:",nrow(test_data)))
## [1] "number of observations in test data: 4494"
print(paste("number of NA's in test data:",sum(is.na(test_data))))
## [1] "number of NA's in test data: 0"
sum(is.na(train_data[,V14]))
## [1] 10319
train_data[, V14 := NULL]
test_data[,V14:=NULL]
print(paste("number of na's in training data after deleting V14 column:",sum(is.na(train_data))))
## [1] "number of na's in training data after deleting V14 column: 21"
# NOTE(review): na.omit() returns a copy; the original code did not assign
# the result, so the 21 NA rows were never actually removed at this step
# (they were only dropped later as a side effect of the outlier filter).
# Assign the result so the cleaning step takes effect.
train_data <- na.omit(train_data)
##              V1       V2       V3       V4       V5       V6       V7       V8
##           <num>    <num>    <num>    <num>    <num>    <num>    <num>    <num>
##     1: 4271.243 4268.296 4023.855 4633.701 4123.519 4349.317 4219.938 4099.909
##     2: 4289.559 4246.557 3942.566 4625.247 4109.779 4324.064 4238.694 4055.565
##     3: 4269.343 4268.123 4002.423 4619.655 4119.264 4341.975 4218.083 4059.838
##     4: 4277.774 4241.873 3974.092 4588.951 4094.744 4315.371 4226.871 4045.703
##     5: 4305.415 4278.254 3990.935 4612.909 4131.057 4341.833 4234.395 4078.737
##    ---                                                                        
## 10461: 4278.239 4236.561 3984.617 4602.907 4106.490 4331.174 4224.743 4061.523
## 10462: 4242.714 4250.394 4001.537 4614.621 4114.467 4332.298 4230.501 4052.240
## 10463: 4283.146 4220.684 3959.518 4616.671 4087.848 4328.896 4240.378 4070.796
## 10464: 4302.162 4277.361 3998.248 4616.635 4116.586 4351.271 4227.369 4094.908
## 10465: 4342.272 4250.866 3984.202 4618.894 4118.444 4365.235 4256.441 4074.424
##              V9      V10      V11      V12      V13      V15      V16   V17
##           <num>    <num>    <num>    <num>    <num>    <num>    <num> <int>
##     1: 4208.919 4628.077 4191.718 4217.955 4289.988 4566.711 4340.008     1
##     2: 4203.122 4605.427 4203.027 4230.053 4275.575 4603.046 4339.954     1
##     3: 4207.642 4633.651 4195.585 4249.387 4266.658 4585.675 4340.059     1
##     4: 4181.712 4602.162 4196.075 4223.062 4263.628 4590.401 4342.895     1
##     5: 4210.574 4631.565 4206.704 4230.628 4279.925 4619.407 4354.185     2
##    ---                                                                     
## 10461: 4209.730 4622.018 4197.167 4230.838 4269.357 4588.413 4339.385     2
## 10462: 4180.039 4605.330 4172.590 4213.924 4246.346 4565.201 4296.446     1
## 10463: 4203.843 4614.622 4194.042 4233.497 4272.198 4581.772 4336.622     2
## 10464: 4201.546 4616.036 4213.862 4256.186 4280.359 4601.565 4374.190     2
## 10465: 4211.373 4603.222 4222.590 4251.372 4292.710 4631.441 4389.726     2

We can see that there are around 10500 total observations. There is a total of 10340 NA's in the training data set. I already know that most of the NA's (10319) are in the V14 column, so I will just delete it. After deleting the column there are only 21 NA's left, which I can omit. There are no NA's in the test data set.

# Number of observations in each split (data.table .N).  The original
# code computed both counts twice; the duplicate assignments are removed.
train_number_of_observations <- train_data[,.N]
test_number_of_observations <- test_data[,.N]

# Express each split as a percentage of all observations (70/30 expected).
total_observations <- train_number_of_observations + test_number_of_observations
distribution_percentage_train <- train_number_of_observations / total_observations * 100
distribution_percentage_test <- test_number_of_observations / total_observations * 100

distribution_percentage_train
## [1] 70
distribution_percentage_test
## [1] 30

I calculated the train test split. The data are divided with 70/30 train test split.

Data Preprocessing

# Drop rows containing measurement-error outliers: any feature value above
# 50 000 is treated as an error.  NOTE(review): `!(NA > 50000)` evaluates
# to NA, and data.table drops NA rows when filtering in i, so this loop
# also removes any remaining rows with NA values as a side effect.
for (col in names(train_data[, !"V17"])) {
  train_data <- train_data[!(get(col) > 50000)]
}

# Rename the target column by name (setnames modifies in place and is
# robust to column reordering) instead of by hard-coded position 16.
setnames(train_data, "V17", "eyes_status")
setnames(test_data, "V17", "eyes_status")

# Separate predictors (x) from the target (y) for both datasets.
train_x <- train_data[,!"eyes_status"]
train_y <- train_data[,eyes_status]
test_x  <- test_data[,!"eyes_status"]
test_y  <- test_data[,eyes_status]
# Factor with explicit levels 1/2 so confusion matrices can be built later.
test_y <- factor(test_y, levels = c(1, 2), labels = c("1", "2"))

I removed rows with outliers that are likely to be errors in measurement. I changed the name of the column with closed and opened eyes, so now the data table makes more sense. I separated the independent variables and the target variable from both the training and test datasets. This prepares them for use in evaluating the classification models. I am also setting test_y as a factor with levels 1 and 2 for creating confusion matrices later.

Classifier Training

Training a classifier using Decision Trees.

Default Decision Tree

# Baseline classification tree with rpart's default control parameters.
dtree_default <- rpart::rpart(eyes_status ~ ., method = "class", data = train_data)
# Complexity-parameter table: cross-validated error (xerror) per split.
printcp(dtree_default)
## 
## Classification tree:
## rpart::rpart(formula = eyes_status ~ ., data = train_data, method = "class")
## 
## Variables actually used in tree construction:
## [1] V13 V15 V16 V3  V4  V8 
## 
## Root node error: 4701/10463 = 0.4493
## 
## n= 10463 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.098064      0   1.00000 1.00000 0.010823
## 2 0.057009      1   0.90194 0.92129 0.010717
## 3 0.032121      2   0.84493 0.85110 0.010574
## 4 0.028079      3   0.81281 0.83110 0.010525
## 5 0.019783      5   0.75665 0.76686 0.010340
## 6 0.014465      7   0.71708 0.73984 0.010250
## 7 0.013827      8   0.70262 0.73452 0.010231
## 8 0.012551      9   0.68879 0.72772 0.010207
## 9 0.010000     10   0.67624 0.70921 0.010139
plotcp(dtree_default)

rpart.plot(dtree_default, type = 2, extra = 101, fallen.leaves = F, main = "Classification Tree for eye status", tweak=1.2)

The first default decision tree doesn't perform well: as we can see, the xerror on level 9 is 0.70921. The decrease in the complexity parameter values with each split suggests that growing the tree further could improve its performance. Across the nine levels of CP I don't see any overfitting, meaning there is room for improvement of the decision tree. Next I can grow the full tree and see if there will be overfitting. If yes, then I can prune it afterwards.

Full Decision Tree

# Grow the full (unpruned) tree: minsplit = 1 and cp = 0 let rpart split
# until the training data is fit perfectly (rel error reaches 0 below),
# deliberately overfitting so a pruning level can be chosen afterwards.
dtree_full <- rpart::rpart(eyes_status ~ ., method = "class", data = train_data,
                           control = rpart.control(minsplit = 1, cp = 0))
printcp(dtree_full)
## 
## Classification tree:
## rpart::rpart(formula = eyes_status ~ ., data = train_data, method = "class", 
##     control = rpart.control(minsplit = 1, cp = 0))
## 
## Variables actually used in tree construction:
##  [1] V1  V10 V11 V12 V13 V15 V16 V2  V3  V4  V5  V6  V7  V8  V9 
## 
## Root node error: 4701/10463 = 0.4493
## 
## n= 10463 
## 
##            CP nsplit rel error  xerror      xstd
## 1  0.09806424      0  1.000000 1.00000 0.0108234
## 2  0.05700915      1  0.901936 0.92023 0.0107153
## 3  0.03212083      2  0.844927 0.84876 0.0105687
## 4  0.02807913      3  0.812806 0.80664 0.0104595
## 5  0.01978302      5  0.756648 0.76431 0.0103321
## 6  0.01446501      7  0.717081 0.72814 0.0102087
## 7  0.01382685      8  0.702616 0.72155 0.0101847
## 8  0.01255052      9  0.688790 0.72049 0.0101808
## 9  0.00829611     10  0.676239 0.69730 0.0100925
## 10 0.00723250     11  0.667943 0.67560 0.0100045
## 11 0.00531802     13  0.653478 0.67305 0.0099938
## 12 0.00489258     16  0.637524 0.65667 0.0099234
## 13 0.00416933     18  0.627739 0.65369 0.0099103
## 14 0.00404169     28  0.568815 0.64327 0.0098635
## 15 0.00393533     29  0.564773 0.64199 0.0098577
## 16 0.00361625     31  0.556903 0.64050 0.0098508
## 17 0.00333262     32  0.553287 0.63178 0.0098104
## 18 0.00276537     36  0.535418 0.62263 0.0097671
## 19 0.00262356     38  0.529887 0.61072 0.0097090
## 20 0.00255265     41  0.522017 0.60987 0.0097048
## 21 0.00233993     42  0.519464 0.60200 0.0096655
## 22 0.00219811     45  0.512444 0.59689 0.0096395
## 23 0.00212721     48  0.505850 0.59519 0.0096308
## 24 0.00191449     52  0.497341 0.59264 0.0096176
## 25 0.00180813     59  0.482238 0.58668 0.0095866
## 26 0.00170177     61  0.478622 0.59094 0.0096088
## 27 0.00159541     62  0.476920 0.58030 0.0095529
## 28 0.00154223     69  0.464795 0.58030 0.0095529
## 29 0.00148904     73  0.458626 0.57903 0.0095461
## 30 0.00138268     75  0.455648 0.57924 0.0095472
## 31 0.00134723     82  0.445437 0.57371 0.0095175
## 32 0.00127632     85  0.441395 0.57456 0.0095221
## 33 0.00122314     89  0.436290 0.56882 0.0094908
## 34 0.00116996     99  0.420974 0.56882 0.0094908
## 35 0.00106360    105  0.413954 0.56414 0.0094650
## 36 0.00101042    119  0.399064 0.56265 0.0094568
## 37 0.00095724    124  0.393321 0.56371 0.0094627
## 38 0.00092179    142  0.375239 0.56520 0.0094709
## 39 0.00085088    150  0.367581 0.56520 0.0094709
## 40 0.00077998    183  0.338864 0.56286 0.0094579
## 41 0.00074452    193  0.330568 0.56265 0.0094568
## 42 0.00070907    207  0.320145 0.56265 0.0094568
## 43 0.00063816    210  0.318017 0.56137 0.0094497
## 44 0.00056726    272  0.278026 0.56265 0.0094568
## 45 0.00053180    280  0.273133 0.56562 0.0094733
## 46 0.00049635    301  0.261859 0.56626 0.0094768
## 47 0.00047862    319  0.250160 0.56626 0.0094768
## 48 0.00042544    328  0.245692 0.56328 0.0094603
## 49 0.00037226    476  0.181876 0.56477 0.0094686
## 50 0.00035453    488  0.176345 0.56562 0.0094733
## 51 0.00031908    515  0.166348 0.56839 0.0094885
## 52 0.00028363    614  0.132950 0.57116 0.0095036
## 53 0.00027350    639  0.125292 0.57413 0.0095198
## 54 0.00026590    646  0.123378 0.57435 0.0095209
## 55 0.00024817    650  0.122314 0.57435 0.0095209
## 56 0.00021272    664  0.118273 0.59179 0.0096132
## 57 0.00018613   1077  0.029994 0.59179 0.0096132
## 58 0.00017018   1085  0.028505 0.59221 0.0096154
## 59 0.00015954   1098  0.026165 0.59434 0.0096264
## 60 0.00014181   1104  0.025101 0.59689 0.0096395
## 61 0.00012763   1128  0.021698 0.59775 0.0096439
## 62 0.00012155   1138  0.020421 0.59838 0.0096471
## 63 0.00010636   1145  0.019570 0.60753 0.0096932
## 64 0.00000000   1328  0.000000 0.60753 0.0096932
plotcp(dtree_full)

rpart.plot(dtree_full, type = 2, extra = 101, fallen.leaves = F, tweak = 1.2, main = "Entire Tree for eye status")
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Row index of the cptable entry with the lowest cross-validated error.
best_value <- which.min(dtree_full$cptable[, "xerror"])

Here I created the full decision tree. We can see that the full tree is very complex (64 levels of CP) and the tree plot is unreadable. I will find the best value of xerror and then prune the tree. The best value of xerror is 0.56137 and it is on level 43. That means we need to prune the tree at level 43.

Pruned Decision Tree

# CP value at the minimum cross-validated error; pruning at this CP keeps
# the subtree with the best generalisation estimate.
best_cp_for_pruning <- dtree_full$cptable[best_value, "CP"]
dtree_pruned <- prune(dtree_full, cp = best_cp_for_pruning)
printcp(dtree_pruned)
## 
## Classification tree:
## rpart::rpart(formula = eyes_status ~ ., data = train_data, method = "class", 
##     control = rpart.control(minsplit = 1, cp = 0))
## 
## Variables actually used in tree construction:
##  [1] V1  V10 V11 V12 V13 V15 V16 V2  V3  V4  V5  V6  V7  V8  V9 
## 
## Root node error: 4701/10463 = 0.4493
## 
## n= 10463 
## 
##            CP nsplit rel error  xerror      xstd
## 1  0.09806424      0   1.00000 1.00000 0.0108234
## 2  0.05700915      1   0.90194 0.92023 0.0107153
## 3  0.03212083      2   0.84493 0.84876 0.0105687
## 4  0.02807913      3   0.81281 0.80664 0.0104595
## 5  0.01978302      5   0.75665 0.76431 0.0103321
## 6  0.01446501      7   0.71708 0.72814 0.0102087
## 7  0.01382685      8   0.70262 0.72155 0.0101847
## 8  0.01255052      9   0.68879 0.72049 0.0101808
## 9  0.00829611     10   0.67624 0.69730 0.0100925
## 10 0.00723250     11   0.66794 0.67560 0.0100045
## 11 0.00531802     13   0.65348 0.67305 0.0099938
## 12 0.00489258     16   0.63752 0.65667 0.0099234
## 13 0.00416933     18   0.62774 0.65369 0.0099103
## 14 0.00404169     28   0.56882 0.64327 0.0098635
## 15 0.00393533     29   0.56477 0.64199 0.0098577
## 16 0.00361625     31   0.55690 0.64050 0.0098508
## 17 0.00333262     32   0.55329 0.63178 0.0098104
## 18 0.00276537     36   0.53542 0.62263 0.0097671
## 19 0.00262356     38   0.52989 0.61072 0.0097090
## 20 0.00255265     41   0.52202 0.60987 0.0097048
## 21 0.00233993     42   0.51946 0.60200 0.0096655
## 22 0.00219811     45   0.51244 0.59689 0.0096395
## 23 0.00212721     48   0.50585 0.59519 0.0096308
## 24 0.00191449     52   0.49734 0.59264 0.0096176
## 25 0.00180813     59   0.48224 0.58668 0.0095866
## 26 0.00170177     61   0.47862 0.59094 0.0096088
## 27 0.00159541     62   0.47692 0.58030 0.0095529
## 28 0.00154223     69   0.46479 0.58030 0.0095529
## 29 0.00148904     73   0.45863 0.57903 0.0095461
## 30 0.00138268     75   0.45565 0.57924 0.0095472
## 31 0.00134723     82   0.44544 0.57371 0.0095175
## 32 0.00127632     85   0.44140 0.57456 0.0095221
## 33 0.00122314     89   0.43629 0.56882 0.0094908
## 34 0.00116996     99   0.42097 0.56882 0.0094908
## 35 0.00106360    105   0.41395 0.56414 0.0094650
## 36 0.00101042    119   0.39906 0.56265 0.0094568
## 37 0.00095724    124   0.39332 0.56371 0.0094627
## 38 0.00092179    142   0.37524 0.56520 0.0094709
## 39 0.00085088    150   0.36758 0.56520 0.0094709
## 40 0.00077998    183   0.33886 0.56286 0.0094579
## 41 0.00074452    193   0.33057 0.56265 0.0094568
## 42 0.00070907    207   0.32014 0.56265 0.0094568
## 43 0.00063816    210   0.31802 0.56137 0.0094497
rpart.plot(dtree_pruned, type = 2, extra = 101, fallen.leaves = F, tweak = 1.2, main = "Pruned Tree for eye status")
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

Here I created the pruned decision tree at level 43. First I determined the best level for pruning and then I pruned the tree based on that level. We can see now that the final tree is simpler, prevents overfitting and performs better than the full tree.

Predicting on Decision Tree model

# Predict eye status on the test predictors with each of the three trees.
pred_y_dtree_default <- predict(dtree_default, newdata = test_x, type = "class")
pred_y_dtree_full <- predict(dtree_full, newdata = test_x, type = "class")
pred_y_dtree_pruned <- predict(dtree_pruned, newdata = test_x, type = "class")

# Confusion matrices with class "2" (closed eyelids) as the positive
# class, reporting precision/recall/F1.  All arguments are named for
# consistency across the three calls.
default_decision_tree_confusion_m <- confusionMatrix(data = pred_y_dtree_default, reference = test_y, positive = "2", mode = "prec_recall")
full_decision_tree_confusion_m <- confusionMatrix(data = pred_y_dtree_full, reference = test_y, positive = "2", mode = "prec_recall")
pruned_decision_tree_confusion_m <- confusionMatrix(data = pred_y_dtree_pruned, reference = test_y, positive = "2", mode = "prec_recall")

default_decision_tree_confusion_m
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 2087  944
##          2  393 1070
##                                           
##                Accuracy : 0.7025          
##                  95% CI : (0.6889, 0.7158)
##     No Information Rate : 0.5518          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3826          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##               Precision : 0.7314          
##                  Recall : 0.5313          
##                      F1 : 0.6155          
##              Prevalence : 0.4482          
##          Detection Rate : 0.2381          
##    Detection Prevalence : 0.3255          
##       Balanced Accuracy : 0.6864          
##                                           
##        'Positive' Class : 2               
## 
full_decision_tree_confusion_m
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 2060  490
##          2  420 1524
##                                           
##                Accuracy : 0.7975          
##                  95% CI : (0.7855, 0.8092)
##     No Information Rate : 0.5518          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5893          
##                                           
##  Mcnemar's Test P-Value : 0.02218         
##                                           
##               Precision : 0.7840          
##                  Recall : 0.7567          
##                      F1 : 0.7701          
##              Prevalence : 0.4482          
##          Detection Rate : 0.3391          
##    Detection Prevalence : 0.4326          
##       Balanced Accuracy : 0.7937          
##                                           
##        'Positive' Class : 2               
## 
pruned_decision_tree_confusion_m
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 2062  473
##          2  418 1541
##                                           
##                Accuracy : 0.8017          
##                  95% CI : (0.7898, 0.8133)
##     No Information Rate : 0.5518          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5981          
##                                           
##  Mcnemar's Test P-Value : 0.07044         
##                                           
##               Precision : 0.7866          
##                  Recall : 0.7651          
##                      F1 : 0.7757          
##              Prevalence : 0.4482          
##          Detection Rate : 0.3429          
##    Detection Prevalence : 0.4359          
##       Balanced Accuracy : 0.7983          
##                                           
##        'Positive' Class : 2               
## 

I am predicting with all 3 decision tree classifiers I created, with positive class "2" = closed eyelids. Based on the confusion matrices and the calculations of Accuracy, Precision, Recall and F1-Score, we can see that the pruned decision tree classification model performs the best out of these three models. The accuracy is 80.17%, which is not great but can be satisfactory in some cases. The same goes for the Precision, Recall and F1 values. I saved the best confusion matrix as a variable so I can create a final table.

Training a classifier using SVM.

# Linear-kernel SVM baseline (C-classification, cost = 1).
# NOTE(review): scale = FALSE disables feature standardisation; SVMs are
# scale-sensitive and the feature magnitudes here are ~4000, so
# scale = TRUE would likely improve these results — worth confirming.
svm_model<- svm(eyes_status ~ ., 
                data = train_data, 
                type = "C-classification", 
                kernel = "linear",
                cost = 1,
                scale = FALSE)
svm_model
## 
## Call:
## svm(formula = eyes_status ~ ., data = train_data, type = "C-classification", 
##     kernel = "linear", cost = 1, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  5843
svm_model$rho
## [1] -5783.164

I created a linear SVM classification model based on the training data. We can see that it has a really large number of support vectors: 5843. Then we can see the indexes of the support vectors, the SV coordinates and the negative eyes_status intercept of the decision boundary.

pred_train <- predict(svm_model, train_data)
mean(pred_train == train_data$eyes_status)
## [1] 0.6246774
pred_test <- predict(svm_model, test_data)
linear_svm_matrix <- confusionMatrix(data = pred_test, reference = test_y)
linear_svm_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 1928 1071
##          2  552  943
##                                           
##                Accuracy : 0.6389          
##                  95% CI : (0.6246, 0.6529)
##     No Information Rate : 0.5518          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2517          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7774          
##             Specificity : 0.4682          
##          Pos Pred Value : 0.6429          
##          Neg Pred Value : 0.6308          
##              Prevalence : 0.5518          
##          Detection Rate : 0.4290          
##    Detection Prevalence : 0.6673          
##       Balanced Accuracy : 0.6228          
##                                           
##        'Positive' Class : 1               
## 

We can see that the accuracy of this model is 62.46774% on the training set and 63.88518% on the test set, which is bad, so I wouldn't use this model for predicting.

# Degree-2 polynomial-kernel SVM, same cost as the linear baseline so the
# kernels are directly comparable.
# NOTE(review): scale = FALSE again leaves the features unstandardised;
# consider scale = TRUE — presumably a large factor in the poor accuracy.
svm_model<- svm(eyes_status ~ ., 
                data = train_data, 
                type = "C-classification", 
                kernel = "polynomial",
                cost = 1,
                degree = 2,
                scale = FALSE)
svm_model
## 
## Call:
## svm(formula = eyes_status ~ ., data = train_data, type = "C-classification", 
##     kernel = "polynomial", cost = 1, degree = 2, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  2 
##      coef.0:  0 
## 
## Number of Support Vectors:  5795

Here I train the model with a polynomial kernel to see if there is a difference in accuracy.

pred_train <- predict(svm_model, train_data)
mean(pred_train == train_data$eyes_status)
## [1] 0.636911
pred_test <- predict(svm_model, test_data)
poly_svm_matrix <- confusionMatrix(data = pred_test, reference = test_y)
poly_svm_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 1984 1028
##          2  496  986
##                                           
##                Accuracy : 0.6609          
##                  95% CI : (0.6468, 0.6747)
##     No Information Rate : 0.5518          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2969          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.4896          
##          Pos Pred Value : 0.6587          
##          Neg Pred Value : 0.6653          
##              Prevalence : 0.5518          
##          Detection Rate : 0.4415          
##    Detection Prevalence : 0.6702          
##       Balanced Accuracy : 0.6448          
##                                           
##        'Positive' Class : 1               
## 

As the accuracy shows, the polynomial kernel is a little better than the linear one, but it is still performing very badly.

# Radial (RBF) kernel SVM.
# NOTE(review): the original call passed degree = 2, but e1071::svm only
# uses `degree` for the polynomial kernel, so it was a no-op here and has
# been removed.  scale = FALSE is kept for comparability with the other
# kernels, although scaling would likely help the RBF kernel most.
svm_model<- svm(eyes_status ~ ., 
                data = train_data, 
                type = "C-classification", 
                kernel = "radial",
                cost = 1,
                scale = FALSE)
svm_model
## 
## Call:
## svm(formula = eyes_status ~ ., data = train_data, type = "C-classification", 
##     kernel = "radial", cost = 1, degree = 2, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  10463
# Predict on the test set and summarise performance; the confusion matrix
# is kept for the model-comparison table at the end of the report.
pred_test <- predict(svm_model, test_data)
radial_svm_matrix <- confusionMatrix(data = pred_test, reference = test_y)
radial_svm_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 2480 2014
##          2    0    0
##                                           
##                Accuracy : 0.5518          
##                  95% CI : (0.5372, 0.5665)
##     No Information Rate : 0.5518          
##     P-Value [Acc > NIR] : 0.5062          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.5518          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.5518          
##          Detection Rate : 0.5518          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 

The radial-kernel model with unscaled features degenerates completely: it predicts class 1 for every test observation (sensitivity 1, specificity 0), so its 55.18% accuracy is just the majority-class rate and the model is useless here.

Training a classifier using KNN.

# Leave-one-out cross-validated KNN predictions for every odd k from 1 to
# 85, collected in a list keyed by the value of k.
Kselection <- seq(1, 85, 2)
training_pred <- setNames(
  lapply(Kselection, function(k) knn.cv(train = train_x, cl = train_y, k = k)),
  as.character(Kselection)
)
# Accuracy of a set of predictions against a reference. Both factors are
# first aligned onto the union of their levels, because confusionMatrix
# requires identical level sets on both inputs.
get_accuracy <- function(prediction, reference) {
  shared_levels <- union(levels(prediction), levels(reference))
  cm <- confusionMatrix(
    data      = factor(prediction, levels = shared_levels),
    reference = factor(reference, levels = shared_levels)
  )
  cm$overall["Accuracy"]
}
# CV accuracy for every candidate k. vapply guarantees a numeric vector
# regardless of input (sapply's return type depends on its input).
accuracies <- vapply(training_pred, get_accuracy, numeric(1), reference = train_y)
# BUG FIX: plotly's scatter `mode` flag is "lines", not "line" — the
# original value is not a valid mode and makes plotly fall back with a
# warning instead of drawing the intended line.
plot_ly(x = Kselection, y = accuracies, type = "scatter", mode = "lines")
# Final KNN model with the best k found above (k = 7); class::knn fits and
# predicts in a single call, so there is no separate model object.
test_pred <- knn(train = train_x,
                 cl    = train_y,
                 test  = test_x,
                 k     = 7)
knn_matrix <- confusionMatrix(data = test_pred, reference = test_y)
knn_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 2311  227
##          2  169 1787
##                                         
##                Accuracy : 0.9119        
##                  95% CI : (0.9032, 0.92)
##     No Information Rate : 0.5518        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.8214        
##                                         
##  Mcnemar's Test P-Value : 0.004179      
##                                         
##             Sensitivity : 0.9319        
##             Specificity : 0.8873        
##          Pos Pred Value : 0.9106        
##          Neg Pred Value : 0.9136        
##              Prevalence : 0.5518        
##          Detection Rate : 0.5142        
##    Detection Prevalence : 0.5648        
##       Balanced Accuracy : 0.9096        
##                                         
##        'Positive' Class : 1             
## 

For KNN I search for the best number of neighbours and plot the resulting accuracies. Since 7 and 9 neighbours gave the best cross-validated accuracy, I trained the final model with 7 neighbours, reaching a test accuracy of 91.19%.

Training a classifier using Random Forest.

# Random forest via ranger; the target must be a factor for classification.
train_y <- as.factor(train_y)
rf.orig <- ranger(x = train_x, y = train_y)

# This confusion matrix is computed on ranger's out-of-bag predictions, so
# it is an honest training-time performance estimate.
confusionMatrix(data = rf.orig$predictions, reference = train_y,
                                 positive = "2", mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 5223 1060
##          2  539 3641
##                                          
##                Accuracy : 0.8472         
##                  95% CI : (0.8401, 0.854)
##     No Information Rate : 0.5507         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.688          
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##               Precision : 0.8711         
##                  Recall : 0.7745         
##                      F1 : 0.8200         
##              Prevalence : 0.4493         
##          Detection Rate : 0.3480         
##    Detection Prevalence : 0.3995         
##       Balanced Accuracy : 0.8405         
##                                          
##        'Positive' Class : 2              
## 
# Test-set performance of the random forest; kept for the summary table.
my_pred <- predict(object = rf.orig, data = test_x)
rf_matrix <- confusionMatrix(data = my_pred$predictions, reference = test_y, positive = "2", mode = "prec_recall")
rf_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 2333  323
##          2  147 1691
##                                           
##                Accuracy : 0.8954          
##                  95% CI : (0.8861, 0.9042)
##     No Information Rate : 0.5518          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7868          
##                                           
##  Mcnemar's Test P-Value : 6.907e-16       
##                                           
##               Precision : 0.9200          
##                  Recall : 0.8396          
##                      F1 : 0.8780          
##              Prevalence : 0.4482          
##          Detection Rate : 0.3763          
##    Detection Prevalence : 0.4090          
##       Balanced Accuracy : 0.8902          
##                                           
##        'Positive' Class : 2               
## 

We can see good accuracy, precision, recall and F1 scores on both the train and test sets. The model performs better on the test set than on the out-of-bag estimate, with an accuracy of 89.54%, although this is still below the KNN result. We could also rebalance the data.

Training a classifier using AdaBoost.

# AdaBoost: tune the number of boosting iterations (mfinal).
# The target must be a factor so adabag treats this as classification.
train_data$eyes_status <- as.factor(train_data$eyes_status)

# FIX: use list() rather than c() — the models are complex objects and a
# list is the correct container for them.
ada_list <- list()
mfinal_values <- c(10, 50, 100, 200, 300, 400)
for (mfinal in mfinal_values) {
  model <- adabag::boosting(eyes_status ~ ., data = train_data, boos = TRUE, mfinal = mfinal)
  ada_list[[as.character(mfinal)]] <- model
}

# Evaluate each fitted model on the held-out test set. Build all rows
# first and bind once — avoids the O(n^2) rbind-in-a-loop anti-pattern.
accuracy_rows <- lapply(mfinal_values, function(mfinal) {
  predictions <- predict(ada_list[[as.character(mfinal)]], newdata = test_data)
  data.frame(MFinal = mfinal,
             Accuracy = mean(predictions$class == test_data$eyes_status))
})
accuracy_data <- do.call(rbind, accuracy_rows)

plot <- plot_ly(accuracy_data, type = "scatter", mode = "lines+markers", x = ~MFinal, y = ~Accuracy, name = "Accuracy")
plot
# Evaluate the mfinal = 300 model on the test features.
# NOTE(review): mfinal = 300 is hard-coded here even though models up to
# mfinal = 400 were fitted — confirm this is the intended selection rather
# than picking the argmax of accuracy_data programmatically.
model <- ada_list[["300"]]
my_pred_adaboost <- predict(model, newdata = test_x)
adaboost_matrix <- confusionMatrix(as.factor(my_pred_adaboost$class), test_y, mode = "prec_recall", positive = "2")
adaboost_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 2202  472
##          2  278 1542
##                                           
##                Accuracy : 0.8331          
##                  95% CI : (0.8219, 0.8439)
##     No Information Rate : 0.5518          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6595          
##                                           
##  Mcnemar's Test P-Value : 1.823e-12       
##                                           
##               Precision : 0.8473          
##                  Recall : 0.7656          
##                      F1 : 0.8044          
##              Prevalence : 0.4482          
##          Detection Rate : 0.3431          
##    Detection Prevalence : 0.4050          
##       Balanced Accuracy : 0.8268          
##                                           
##        'Positive' Class : 2               
## 

I trained AdaBoost classifiers with different mfinal values (tuning the mfinal parameter); the model evaluated above uses mfinal = 300 and reaches an accuracy of 83.31%, the best performance among the values I tried.

Training a classifier using XGBoost.

# XGBoost expects a numeric feature matrix and a 0/1 label vector, so the
# feature tables are converted and the class labels recoded (1 -> 0, 2 -> 1).
train_features <- as.matrix(train_x)
train_labels <- as.numeric(train_y != 1)
xgb <- xgboost::xgboost(data = train_features,
                        label = train_labels,
                        nrounds = 3000, objective = "binary:logistic")

# Score the test set and threshold the predicted probabilities at 0.5.
test_features <- as.matrix(test_x)
test_probs <- predict(xgb, newdata = test_features)
test_labels <- as.numeric(test_y != 1)
predicted_class <- as.integer(test_probs > 0.5)

xgb_matrix <- confusionMatrix(as.factor(predicted_class), as.factor(test_labels),
                              mode = "prec_recall", positive = "1")
xgb_matrix

For the XGBoost model I needed to convert the test_x and train_x data frames to matrices, and recode test_y and train_y as binary 0/1 labels, so the algorithm works. I ran the model with different numbers of boosting rounds and 3000 seemed to work best, giving a very good accuracy of 91.34%.

Sumarizing table of all models

# Summary of all fitted models. The confusion matrices are collected in a
# single named list so the four metrics can be extracted uniformly, instead
# of through four hand-maintained parallel c() calls that are easy to get
# out of sync with the method-name vector.
confusion_list <- list(
  Default_tree   = default_decision_tree_confusion_m,
  Full_tree      = full_decision_tree_confusion_m,
  Pruned_tree    = pruned_decision_tree_confusion_m,
  Linear_svm     = linear_svm_matrix,
  Polynomial_svm = poly_svm_matrix,
  Radial_svm     = radial_svm_matrix,
  KNN            = knn_matrix,
  Random_Forest  = rf_matrix,
  Adaboost       = adaboost_matrix,
  XGboost        = xgb_matrix
)

# Pull one scalar metric (by slot and name) out of every confusion matrix.
extract_metric <- function(matrices, slot, metric) {
  unname(vapply(matrices, function(cm) cm[[slot]][[metric]], numeric(1)))
}

summary_df <- data.frame(
  Method    = names(confusion_list),
  Accuracy  = extract_metric(confusion_list, "overall", "Accuracy"),
  Precision = extract_metric(confusion_list, "byClass", "Precision"),
  Recall    = extract_metric(confusion_list, "byClass", "Recall"),
  F1_score  = extract_metric(confusion_list, "byClass", "F1")
)

paged_df <- paged_table(summary_df)
paged_df

# One trace per metric; x positions are the row indices, relabelled with
# the method names on the axis. seq_len is the safe form of seq(1, n).
x_pos <- seq_len(nrow(summary_df))
plot_ly(summary_df, type = "scatter", mode = "lines+markers",
        y = ~Accuracy, x = x_pos, name = "Accuracy") |>
  add_trace(y = ~Precision, x = x_pos, name = "Precision") |>
  add_trace(y = ~Recall, x = x_pos, name = "Recall") |>
  add_trace(y = ~F1_score, x = x_pos, name = "F1_score") |>
  layout(xaxis = list(tickmode = "array", tickvals = x_pos,
                      ticktext = summary_df$Method),
         title = "Summary Plot of precisions for all approaches",
         yaxis = list(title = 'Values'))

I created a plot comparing all the approaches I trained. We can see that the best-performing model is KNN, with all measures above 90 percent. The worst-performing approaches for this type of classification are all three types of support vector machines.